---
title: "Data munging"
author: "Jae Yeon Kim"
date: "`r Sys.Date()`"
output:
  html_document:
    df_print: paged
---

# Load packages 

```{r}
if (!require(pacman)) install.packages("pacman")

pacman::p_load(tidyverse, # tidyverse 
               here, # computational reproducibility
               glue, # gluing objects and strings 
               tidylog, # logging analysis
               readxl, 
               naniar,
               zeallot,
               flextable, # table
               survey,  # weighted proportion tables
               anesrake) # creating weight variables

source(here("functions/utils.r"))
```

# Tidying 

## Load and merge data 

```{r message = FALSE}
# Check the file names
# list.files(here("raw_data"))
w1 <- read_csv(here("raw_data/wave1_raw.csv"))[-c(1:2),]
w2 <- read_csv(here("raw_data/wave2_raw.csv"))[-c(1:2),]
w3 <- read_xls(here("raw_data/w3_numeric_jul10.xls"))[-c(1:2),]
```

```{r message = FALSE}
# Codebook
codebook <- read_csv(here("raw_data/codebook_jul10.csv"))

# Check dimensions 
ncol(w1) == nrow(subset(codebook, wave == "w1"))
ncol(w2) == nrow(subset(codebook, wave == "w2"))
ncol(w3) == nrow(subset(codebook, wave == "w3"))
```

```{r}
# make unique variations of drop variable 
codebook$alt_name[grepl("drop", codebook$alt_name)] <- make.unique(codebook$alt_name[grepl("drop", codebook$alt_name)], sep = "_")
```

## Change column names 

```{r message = FALSE}
names(w1) <- subset(codebook, wave == "w1")$alt_name
names(w2) <- subset(codebook, wave == "w2")$alt_name
names(w3) <- subset(codebook, wave == "w3")$alt_name
```

## Drop "drop_" columns 

```{r}
w1 <- w1 %>% select(!contains("drop"))
w2 <- w2 %>% select(!contains("drop"))
w3 <- w3 %>% select(!contains("drop"))
```

## Add missing variables in each wave

```{r}
w1_copy <- w1
w2_copy <- w2
w3_copy <- w3

# Add missing columns between w1 and w2 to w1
w1_copy <- add_miss_cols(w1_copy, w2_copy)
w2_copy <- add_miss_cols(w2_copy, w1_copy)

# Add missing columns between w3 and w2 to w3
w3_copy <- add_miss_cols(w3_copy, w2_copy)
w2_copy <- add_miss_cols(w2_copy, w3_copy)

# Add missing columns between w3 and w1 to w3
w3_copy <- add_miss_cols(w3_copy, w1_copy)
w1_copy <- add_miss_cols(w1_copy, w3_copy)

# Check
sum(is.na(w2_copy$expdiscrim)) == nrow(w2_copy)
sum(is.na(w3_copy$expdiscrim)) == nrow(w3_copy)

w1 <- w1_copy
w2 <- w2_copy
w3 <- w3_copy
```

## Add an wave identifier 

```{r}
names(w1) <- glue("W1_{names(w1)}")
names(w2) <- glue("W2_{names(w2)}")
names(w3) <- glue("W3_{names(w3)}")
```

## Merge dataframes 

```{r}
# Inner join
w12 <- inner_join(w1, w2, by = c("W1_respondent.id" = "W2_respondent.id"))
w13 <- inner_join(w1, w3, by = c("W1_pid" = "W3_pid"))

# Remove duplicates 
w12 <- w12[!duplicated(w12$W1_respondent.id),]
w13 <- w13[!duplicated(w13$W1_pid),]
```

## Calculate attrition rate 

$\textrm{Attrition rate} = \frac{\textrm{# of pre_wave participants} - \textrm{# of post_wave participants}}{\textrm{# pre_wave participants}}$

```{r}
attr_rate <- function(pre, post) {
    out <- (pre - post)/pre 
    round(out, 2)}

glue("The attrition rate between w1 and w2 is: {attr_rate({nrow(w1)}, nrow(w12)) * 100}%")
glue("The attrition rate between w2 and w3 is: {attr_rate(nrow(w12), nrow(w13)) * 100}%")
```

The attrition rate between w1 and w2 is: 33%
The attrition rate between w2 and w3 is: 38%

## Find common questions 

```{r}
# Wave 1 and 2
common.q12 <- intersect(extract_name(w12, "W2"), extract_name(w12, "W1"))

long12 <- w12 %>% select(matches(common.q12) | W1_respondent.id) 

# Check Chinese %
mean(long12$W1_natorigin == 1)
```

```{r}
# Wave 1 and 3
common.q13 <- intersect(extract_name(w13, "W3"), extract_name(w13, "W1"))

long13 <- w13 %>%
  select(matches(common.q13) | W1_pid) 

# Check Chinese %
mean(long12$W1_natorigin == 1)
```

```{r}
# Join
long_df <- left_join(w13, w12, by = c("W1_pid" = "W1_pid"))

# Remove .y columns 
long_df <- long_df %>%
  select(!contains(".y"))

names(long_df) <- str_replace_all(names(long_df), ".x", "")

# Check Chinese %
mean(long_df$W1_natorigin == 1)
```

## Bind by rows 

```{r}
# wave1
w1 <- long_df %>%
  select(matches("W1")) %>%
  select(!matches("W2|W3"))

names(w1) <- str_replace_all(names(w1), "W1_", "")

w1$wave <- 1

# wave2 
w2 <- long_df %>%
  select(matches("W2")) %>%
  select(!matches("W1|W3"))

names(w2) <- str_replace_all(names(w2), "W2_", "")

w2$wave <- 2

# wave3 
w3 <- long_df %>%
  select(matches("W3")) %>%
  select(!matches("W1|W2"))

names(w3) <- str_replace_all(names(w3), "W3_", "")

w3$wave <- 3

# combine them

w1$respondent.id <- NULL
w2$pid <- w1$pid
w3$pid <- w1$pid

binded_df <- bind_rows(w1, w2, w3)

glue("The total # of participatns in each wave in the panel data: {nrow(binded_df)/3}")

# Check
subset(binded_df, wave == 1)$gendiscrim[1:5]
mean(binded_df$natorigin == 1, na.rm = TRUE)
```

## Final merging 

```{r}
completed_df <- binded_df %>%
  dplyr::select(!matches("response|respondent"))

# Fill in missing values using the previous wave information

# Target variables
vars <- c("race", "gender", "natorigin", "usborn")

completed_df <- fill(completed_df, race)
completed_df <- fill(completed_df, gender)
completed_df <- fill(completed_df, usborn)
```


```{r}
# 1992 obs and 105 variables 
dim(completed_df)

if (!dir.exists(here("processed_data"))) dir.create(here("processed_data"))

write.csv(completed_df, here("processed_data/panel_data.csv"))
```

# Recoding

## Recode NAs

```{r}
## Replace with NA
mean(is.na(subset(completed_df, wave != 1)$apa.discrim.rona))

df <- completed_df
```

ronaunfair = unfairly treated because of COVID 
Yes (1)
No (2)

ronaunfairasian = unfairly treated because of COVID + Asian
Very likely (1)
Somewhat likely (2)
Not very likely (3)
I'm not sure (4)

rona.apa.mistreat = witnessing COVID-19 discrimination
Yes (1)
No (2)

```{r}
df[df == 9999] <- NA

df$ronaunfairasian[df$ronaunfairasian == 4] <- NA
```

## Recode values 

```{r}
df1 <- subset(df, wave == 1) 
df23 <- subset(df, wave != 1)
```

```{r}
# Party ID
df1$party.id <- recode_party_w1(df1$party.id)
df23$party.id <- recode_party_w23(df23$party.id)

df <- bind_rows(df1, df23)

# Check 
unique(df$party.id) %in% c("Republican", "Independent", "Democrat", "Other", NA) == TRUE
```

```{r}
df12 <- subset(df, wave != 3)
df3 <- subset(df, wave == 3)

# Affect 
df12$whiteaffect <- recode_affect_w12(df12$whiteaffect)

df12$blackaffect <- recode_affect_w12(df12$blackaffect)

df12$latinoaffect <- recode_affect_w12(df12$latinoaffect)

df12$asianaffect <- recode_affect_w12(df12$asianaffect)

df <- bind_rows(df12, df3)
```

- Chinese (1)  
- Taiwanese (2)  
- Indian (3)  
- Korean (4)  
- Filipino (5)  
- Vietnamese (6)  
- Japanese (7)  
- Pakistani (8)  
- Thai (9)  
- Iranian (10)  
- Bangladeshi (11)  
- Laotian (12) 
- Cambodian (13)  
- Hmong (14)  
- Native Hawaiian (15)  
- Samoan (16)  
- Bhutanese (17)  
- Burmese (18)  
- Fijian (19)  
- Indonesian (20)  
- Malaysian (21)  
- Mongolian (22)  
- Nepalese (23)  
- Singaporean (24)  
- Sri Lankan (25)  
- Taiwanese (26)  
- Tongan (27)  
- Other Asian country (please indicate which) (28) 
- Do Not Know (29) [END OF SURVEY] 
- No Asian background (30) [END OF SURVEY] 

```{r}
# Create dummy variables: gender, usborn
c(df$male, df$usborn) %<-% map(list(df$gender, df$usborn), recode_dummy)
```

```{r}
df$all_asian_disc <- if_else(df$apa.discrim.who == 2, 1, 0)
```

```{r}
# SES
df$edu <- rep(subset(df, wave == 1)$edu, 3)
df$state <- rep(subset(df, wave == 1)$state, 3)

# National origin 
df$natorigin <- rep(subset(df, wave == 1)$natorigin, 3)

df$chinese <- if_else(df$natorigin == 1, 1, 0) 
df$indian <- if_else(df$natorigin == 3, 1, 0) 
df$korean <- if_else(df$natorigin == 4, 1, 0) 
df$japanese <- if_else(df$natorigin == 7, 1, 0) 
df$vietnamese <- if_else(df$natorigin == 7, 1, 0)
df$filipino <- if_else(df$natorigin == 7, 1, 0)
```

```{r}
df$yearborn <- as.numeric(rep(subset(df, wave == 1)$yearborn, 3))

df$age <- 2021 - df$yearborn
df$age <- ifelse(df$age > 0 & df$age < 100, df$age, NA)
```

```{r}
df$ronaunfair <- ifelse(df$ronaunfair == 1, 1, 0)
df$rona.apa.mistreat <- ifelse(df$rona.apa.mistreat == 1, 1, 0)
```

```{r}
df <- df %>%
    mutate(biden = if_else(`2020likelycand` == 1, 1, 0))

# replace NA with 0

c(df$rona.behav.sneeze, df$rona.behav.language, df$rona.behav.walk, df$rona.behav.transit, df$rona.behav.other, df$rona.behav.nochange) %<-% map(list(df$rona.behav.sneeze, df$rona.behav.language, df$rona.behav.walk, df$rona.behav.transit, df$rona.behav.other, df$rona.behav.nochange), replacena0)
```

# Extra munging

```{r}
# As numeric
df$gendiscrim <- as.numeric(df$gendiscrim)

# Reverse code 
c(df$apa.discrim.rona, df$linkedfate, df$trump.approve, df$trump.approve, df$apa.responsible, df$apa.have.rona, df$apa.harassment, df$idimport, df$asnpride, df$`2020likelyvote`, df$ronaunfairasian) %<-% map(list(df$apa.discrim.rona, df$linkedfate, df$trump.approve, df$trump.approve, df$apa.responsible, df$apa.have.rona, df$apa.harassment, df$idimport, df$asnpride, df$`2020likelyvote`, df$ronaunfairasian), reverse)
```

```{r}
# Normalize 
c(df$apa.discrim.rona, df$gendiscrim, df$linkedfate, df$trump.approve, df$age, df$edu, df$apa.responsible, df$apa.have.rona, df$apa.harassment, df$idimport, df$asnpride, df$`2020likelyvote`, df$ronaunfairasian, df$income) %<-% map(list(df$apa.discrim.rona, df$gendiscrim, df$linkedfate, df$trump.approve, df$age, df$edu, df$apa.responsible, df$apa.have.rona, df$apa.harassment, df$idimport, df$asnpride, df$`2020likelyvote`, df$ronaunfairasian, df$income), rescale01)

# Factorize (categorical variables)
df$party.id <- factor(df$party.id)
df$state <- factor(df$state)

subset(df, wave == 2)$gendiscrim[1:5]
subset(df, wave == 2)$`2020likelyvote`[1:5]
```

```{r}
# Add factor version wave variable
df$wave_fac <- recode(as.character(df$wave), 
       "1" = "February",
       "2" = "May",
       "3" = "November") 

df$wave_fac <- factor(df$wave_fac, levels = c("February", "May", "November"))

# Check
subset(df, wave == 2)$gendiscrim[1:5]
subset(df, wave == 2)$`2020likelyvote`[1:5]

# Additional dummies 
df$DEM <- if_else(df$party.id == "Democrat", 1, 0)
df$DEM.strong <- if_else(df$party.id == "Democrat" & df$partystr == 1, 1, 0)

df$GOP <- if_else(df$party.id == "Republican", 1, 0)
df$GOP.strong <- if_else(df$party.id == "Republican" & df$partystr == 1, 1, 0)

# Add independent and non partisans
df$independent <- ifelse(df$party.id == "Independent", 1, 0)

df$indi_other <- ifelse(df$party.id %in% c("Democrat", "Republican") | is.na(df$party.id), 0, 1)

df$male <- rep(subset(df, wave == 1)$male, 3)
df$usborn <- rep(subset(df, wave == 1)$usborn, 3)
```

# Create weights 

```{r}
# 2020 estimates from our survey data 
# Chinese - 34% 
# Indian - 15%
# Foreign born rate - 46%
# Female - 47%

# 2020 estimates from the US Census
# Chinese - 24%
# Indian - 21%
# Foreign born rate - 60%
# Female - 53%

df_unweighted <- svydesign(
    id = ~1, 
    data = df
)

# population-level distributions 

chinese_dist <- data.frame(chinese = c(1, 0),
                           Freq = nrow(df) * c(0.24, 0.76))

indian_dist <- data.frame(indian = c(1, 0),
                          Freq = nrow(df) * c(0.21, 0.79))

usborn_dist <- data.frame(usborn = c(1, 0),
                          Freq = nrow(df) * c(0.4, 0.6))

# raking 

df_raked <- rake(design = df_unweighted,
                 sample.margins = list(~chinese, ~indian, ~usborn),
                 population.margins = list(chinese_dist, indian_dist, usborn_dist))

# summarize weights 
summary(weights(df_raked))

# add weights 
df$weights <- weights(df_raked)
```

# Export data 

```{r}
write.csv(df, here("processed_data", "panel_data_recoded.csv"))
```